import re
import sys
import os
import country_list
import unidecode

import pandas as pd

from config import PATH

def make_country_list():
    """Generate a sorted list of country names and abbreviations.

    The entries are collected from the ``country_list`` package:

    * for English, French and German, every value of the mapping built from
      ``countries_for_language(lang)`` (the country names) is added;
    * for every other available language without an underscore in its code,
      the keys of that mapping are added, provided the upper-cased language
      code is itself a key and the entry stored under it is pure ASCII.

    Returns:
        A sorted list of the distinct strings collected.
    """
    countries = set()
    # add all countries in English, French, and German
    for lang in ['en', 'fr', 'de']:
        names = dict(country_list.countries_for_language(lang)).values()
        countries.update(names)

    for lang in country_list.available_languages():
        # skip language codes that contain an underscore
        if "_" in lang:
            continue
        d = dict(country_list.countries_for_language(lang))

        # only consider languages whose upper-cased code is also a key
        if lang.upper() not in d:
            continue
        example = d[lang.upper()]

        # Use that entry as a probe: languages whose names are not
        # ASCII-representable are skipped entirely.
        try:
            example.encode("ascii")
        except UnicodeEncodeError:
            continue

        # NOTE(review): this adds the mapping's *keys* (presumably the country
        # codes), not ``example``. The original comment here said "add only
        # country where this is spoken" -- confirm which one is intended.
        countries.update(d.keys())

    return sorted(countries)
        

class NameNormalizer(object):
    """A class to normalize company names by stripping from them country names as well as legal entity designations (such as LLC, GmbH, etc.)

    A name is lower-cased, transliterated with ``unidecode``, stripped of the
    characters ``. , / ( )`` and has ``&`` spelled out as ``and``.  Afterwards
    every whitespace-delimited occurrence of an entry of
    ``make_country_list()``, of a legal-form / filler token, or of any word of
    at most three word characters is removed, one pattern after the other.
    """

    # Shared instance, created lazily by getInstance().
    instance = None

    # Characters dropped from a name, plus "&" spelled out as "and".
    _CHAR_TABLE = str.maketrans({".": None, ",": None, "/": None, ")": None, "(": None, "&": "and"})

    def __init__(self):
        # Country entries are literal text. They get the same character
        # clean-up as the names they are matched against (otherwise entries
        # with accents, "&", "." or parentheses could never match) and are
        # escaped so they cannot be misread as regex syntax.
        countries = []
        for country in make_country_list():
            cleaned = self._clean(country).strip()
            if cleaned:
                countries.append(re.escape(cleaned))

        # Legal forms and filler words. These are regex fragments, not
        # literals, and the order matters: longer phrases come before the
        # words they contain.
        corps = [r"gmbh", r"gesmbh", r"gesellschaft mbh", r"gesellschaft", r"eg", r"i gr", r"sas?", r"sarl", r"sagl", r"inc", r"corporation", r"incorporated", r"llc", r"pty", r"ltd", r"limited", r"ltee", r"proprietary", r"kg", r"and co", r"co", r"ltda", r"bv", r"bvba", r"sprl", r"as", r"ag", r"ky", r"oy", r"ab", r"sl", r"societe par actions simplifiee", r"societe anonyme", r"sro", r"kft", r"z", r"oo", r"o", r"uk", r"liability"]

        corps.extend([r"company", r"corp", r"mbh", r"ve", r"holdings?", r"group", r"services", r"ograniczona", r"odpowiedzial\w+", r"przedsiebiorstwo", r"zakrytoe", r"aps", r"coltd", r"nv", r"obshchestvo", r"obshhestvo", r"ogranichennoj", r"otvetstvennost'ju", r"otkrytoe", r"associates", r"plc", r"societe", r"ticaret"])
        corps.extend([r"north america", r"south america", r"north africa", r"south africa", r"american?", r"europea?n?", r"asia", r"africa", r"australia", r"global", r"international", r"anonim", r"anonyme", r"sirketi"])
        corps.extend([r"bhd", r"national", r"spolka", r"firma", r"limitee", r"responsabilite", r"aktiengesellschaft", r"cokg", r"aktsionernoe", r"spaaka", r"spoaka", r"ooo", r"cie"])

        # finally drop every remaining word of one to three word characters
        corps.extend([r"\w{1,3}"])

        # The trailing whitespace is only looked at, not consumed: re.sub
        # works on non-overlapping matches, so consuming it would hide the
        # leading delimiter of a directly following token and leave every
        # second of two adjacent tokens in place.
        self.regex = [
            re.compile(r'\s\(?{}\)?(?=\s)'.format(fragment))
            for fragment in countries + corps
        ]

        self.regex_spaces = re.compile(r'\s+')

    @staticmethod
    def _clean(text):
        """Lower-case and transliterate *text*, drop ``. , / ( )`` and turn ``&`` into ``and``."""
        return unidecode.unidecode(text.lower()).translate(NameNormalizer._CHAR_TABLE)

    def normalize(self, name):
        """Return *name* with country names, legal forms and short words removed.

        Args:
            name: the company name as a string.

        Returns:
            The cleaned name with whitespace runs collapsed to single spaces
            and no leading/trailing whitespace; may be the empty string when
            every token was stripped.
        """
        # pad with spaces so the first and last token are whitespace-delimited
        name = " " + self._clean(name) + " "

        for r in self.regex:
            name = r.sub('', name)

        return self.regex_spaces.sub(' ', name).strip()

    @staticmethod
    def getInstance():
        """Return the shared NameNormalizer, building it on first use."""
        if NameNormalizer.instance is None:
            NameNormalizer.instance = NameNormalizer()
        return NameNormalizer.instance

def normalize(name):
    """Normalize a company name with the shared NameNormalizer instance."""
    normalizer = NameNormalizer.getInstance()
    return normalizer.normalize(name)
    

def load_names_and_bvown():
    """Load the company-name table and the filtered ownership table.

    Both tables are read from fixed paths under
    ``../datasets/common_data/orbis_patents/``.  Ownership rows are kept only
    if their ``bvdidnumber`` occurs in the name table and at least one of
    ``DUO_bvdid`` / ``GUO_bvdid`` is set; the ``merged`` column is removed.

    Returns:
        Tuple ``(name2bvdid, bvown)`` of DataFrames.
    """
    name2bvdid = pd.read_csv("../datasets/common_data/orbis_patents/Orbis_patents_2017_company_names.txt", sep="\t")
    ownership = pd.read_csv("../datasets/common_data/orbis_patents/Orbis_patents_2017_DUO_GUO.csv")

    # keep only firms for which a company name is known
    known_ids = set(name2bvdid.BvDIDnumber.unique())
    has_name = ownership.bvdidnumber.apply(lambda bvdid: bvdid in known_ids)
    named = ownership.loc[has_name]

    # ... and which have at least one owner recorded
    has_owner = named.DUO_bvdid.notna() | named.GUO_bvdid.notna()
    owned = named.loc[has_owner]

    owned = owned.drop(columns=["merged"])
    return (name2bvdid, owned)

def load_panel():
    """Build the firm panel: ownership rows joined with one company name each.

    The first ``CompanyName`` per ``BvDIDnumber`` is attached to the ownership
    table, duplicate ``bvdidnumber`` rows are dropped and ``bvdidnumber``
    becomes the index.
    """
    name2bvdid, bvown = load_names_and_bvown()

    # one name per firm id
    first_names = name2bvdid.groupby("BvDIDnumber").agg({'CompanyName': 'first'})

    panel = bvown.merge(
        first_names,
        left_on="bvdidnumber",
        right_index=True,
        indicator=False,
    )
    panel = panel.drop_duplicates("bvdidnumber")
    return panel.set_index('bvdidnumber')


def aggregate_groups(firms,patents=True):
    """Aggregate firm-level rows to one row per ``GUO_bvdid`` group.

    Args:
        firms: DataFrame with ``GUO_bvdid`` and ``bvdidnumber`` columns (plus
            ``total_any_tri_1980`` / ``total_any_bi_1980`` when *patents*).
        patents: when true, also sum the two patent-count columns.

    Returns:
        DataFrame with ``GUO_bvdid`` as a regular column, the summed patent
        columns (if requested) and ``num_firms``, the count of
        ``bvdidnumber`` values per group.
    """
    spec = {'bvdidnumber': 'count'}
    if patents:
        # patent sums go first so the column order stays tri, bi, count
        spec = {'total_any_tri_1980': 'sum', 'total_any_bi_1980': 'sum', 'bvdidnumber': 'count'}

    per_group = firms.groupby("GUO_bvdid").agg(spec)
    per_group = per_group.rename(columns={"bvdidnumber": "num_firms"})
    return per_group.reset_index()

def merge_firms(firms):
    """Suggest merges of firms within the same conglomerate which end up having the same normalized company name.

    The index of *firms* is reset, a ``CompanyName_normalized`` column is
    computed with ``normalize`` and, inside every ``GUO_bvdid`` group that
    ``aggregate_groups`` reports with more than one firm, firms sharing a
    normalized name are mapped onto the first firm of their set.

    Args:
        firms: DataFrame with ``CompanyName`` and ``GUO_bvdid`` columns and a
            ``bvdidnumber`` column available after ``reset_index()``.

    Returns:
        DataFrame with the columns ``bvdid_source`` and ``bvdid_target``, one
        row per suggested merge (empty, but with both columns, when there is
        nothing to merge).
    """
    firms = firms.reset_index()
    print("Normalizing company names...")
    firms["CompanyName_normalized"] = firms.CompanyName.transform(normalize)

    # only conglomerates with more than one firm can contain duplicates
    groups = aggregate_groups(firms, patents=False)
    conglomerates = set(groups.loc[groups.num_firms > 1, "GUO_bvdid"])
    candidates = firms.loc[firms.GUO_bvdid.isin(conglomerates)]

    # One grouped pass instead of re-scanning the whole frame per owner; the
    # groups come out sorted by owner, then by normalized name.
    # NOTE(review): firms whose name normalizes to "" also share a key here
    # and are therefore suggested for merging -- confirm this is wanted.
    by_owner_and_name = candidates.groupby(["GUO_bvdid", "CompanyName_normalized"]).bvdidnumber

    transform = []
    for _, ids in by_owner_and_name:
        firm_set = list(ids)
        main = firm_set[0]
        transform.extend([firm, main] for firm in firm_set[1:])

    print("Suggesting to do {} merges.".format(len(transform)))

    # Passing the columns to the constructor also works for an empty list;
    # assigning them afterwards fails with a length mismatch in that case.
    df = pd.DataFrame(transform, columns=["bvdid_source", "bvdid_target"])
    print(df)
    return df
                    
        
if __name__ == "__main__":
    # Command-line entry point: the first argument names the task to run.
    task = sys.argv[1] if sys.argv[1:] else ""

    # "merge_firms" is the only task handled; anything else (including no
    # argument at all) is rejected.
    if task != "merge_firms":
        raise Exception("No task specified!")

    firms = load_panel()
    df = merge_firms(firms)
    df.to_csv("../datasets/common_data/orbis_patents/firm_merge_map.csv", index=False)

